Análisis Descriptivo
# librerías necesarias para implementar las funciones
library(readxl)
library(glue)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggmosaic)
library(ggridges)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following objects are masked from 'package:data.table':
##
## first, last
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(xtable)
library(here)
## here() starts at /Users/sofiabocker/Desktop/universidad/UCR/Actuariales/Cuarto año/I Ciclo/Estadística Actuarial I/Proyecto/cod
library(skimr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(rcompanion)
library(RCurl)
##
## Attaching package: 'RCurl'
##
## The following object is masked from 'package:tidyr':
##
## complete
# importar base de datos
# Link de la base de datos en github
url <- "https://raw.githubusercontent.com/sofiabocker/proyecto_ca_0303_g08/main/base_datos_alcohol.xlsx"
# descargar el archivo
binary_data <- getBinaryURL(url)
temp_file <- tempfile(fileext = ".xlsx")
writeBin(binary_data, temp_file)
# leer el archivo en R
base_datos <- read_excel(temp_file)
## New names:
## • `` -> `...32`
## • `` -> `...33`
base_datos <- base_datos [, -32]
base_datos <- base_datos [, -32]
base_datos <- head(base_datos, -25)
# Comprimir las variables de 5 categorías en variables de tres categorías <
base_datos_clean <- base_datos %>%
clean_names() %>%
mutate(alcohol_weekdays = fct_collapse(
alcohol_weekdays,
Low = c("Low", "Very Low"),
High = c("High", "Very High"),
Moderate = "Moderate"
))
# Asegurarse que los datos se mantengan como characters
base_datos_clean$alcohol_weekdays <- as.character(base_datos_clean$alcohol_weekdays)
base_datos_clean <- base_datos_clean %>%
clean_names() %>%
mutate(alcohol_weekends = fct_collapse(
alcohol_weekends,
Low = c("Low", "Very Low"),
High = c("High", "Very High"),
Moderate = "Moderate"
))
base_datos_clean$alcohol_weekends <- as.character(base_datos_clean$alcohol_weekends)
base_datos_clean <- base_datos_clean %>%
clean_names() %>%
mutate(health_status = fct_collapse(
health_status ,
Poor = c("Poor", "Very Poor"),
Good = c("Very Good", "Good"),
Fair = "Fair"
))
base_datos_clean$health_status <- as.character(base_datos_clean$health_status)
base_datos_clean <- base_datos_clean %>%
clean_names() %>%
mutate(good_family_relationship = fct_collapse(
good_family_relationship,
Poor = c("Poor", "Very Poor"),
Good = c("Excellent", "Good"),
Fair = "Fair"
))
base_datos_clean$good_family_relationship <- as.character(base_datos_clean$good_family_relationship)
base_datos_clean <- base_datos_clean %>%
clean_names() %>%
mutate(free_time_after_school = fct_collapse(
free_time_after_school,
Low = c("Low", "Very Low"),
High = c("High", "Very High"),
Moderate = "Moderate"
))
base_datos_clean$free_time_after_school <- as.character(base_datos_clean$free_time_after_school)
base_datos_clean <- base_datos_clean %>%
clean_names() %>%
mutate(time_with_friends = fct_collapse(
time_with_friends,
Low = c("Low", "Very Low"),
High = c("High", "Very High"),
Moderate = "Moderate"
))
base_datos_clean$time_with_friends <- as.character(base_datos_clean$time_with_friends)
# muestra la estructura de los datos
str <- str(base_datos_clean)
## tibble [649 × 31] (S3: tbl_df/tbl/data.frame)
## $ school : chr [1:649] "Gabriel Pereira" "Gabriel Pereira" "Gabriel Pereira" "Gabriel Pereira" ...
## $ gender : chr [1:649] "Female" "Female" "Female" "Female" ...
## $ age : num [1:649] 18 17 15 15 16 16 16 17 15 15 ...
## $ housing_type : chr [1:649] "Urban" "Urban" "Urban" "Urban" ...
## $ family_size : chr [1:649] "Above 3" "Above 3" "Up to 3" "Above 3" ...
## $ parental_status : chr [1:649] "Separated" "Living Together" "Living Together" "Living Together" ...
## $ mother_education : chr [1:649] "Higher Education" "Primary School" "Primary School" "Higher Education" ...
## $ father_education : chr [1:649] "Higher Education" "Primary School" "Primary School" "Lower Secondary School" ...
## $ mother_work : chr [1:649] "Homemaker" "Homemaker" "Homemaker" "Health" ...
## $ father_work : chr [1:649] "Teacher" "other" "other" "Services" ...
## $ reason_school_choice : chr [1:649] "Course Preference" "Course Preference" "Other" "Near Home" ...
## $ legal_responsibility : chr [1:649] "Mother" "Father" "Mother" "Mother" ...
## $ commute_time : chr [1:649] "15 to 30 min" "Up to 15 min" "Up to 15 min" "Up to 15 min" ...
## $ weekly_study_time : chr [1:649] "2 to 5h" "2 to 5h" "2 to 5h" "5 to 10h" ...
## $ extra_educational_support : chr [1:649] "Yes" "No" "Yes" "No" ...
## $ parental_educational_support: chr [1:649] "No" "Yes" "No" "Yes" ...
## $ private_tutoring : chr [1:649] "No" "No" "No" "No" ...
## $ extracurricular_activities : chr [1:649] "No" "No" "No" "Yes" ...
## $ attended_daycare : chr [1:649] "Yes" "No" "Yes" "Yes" ...
## $ desire_graduate_education : chr [1:649] "Yes" "Yes" "Yes" "Yes" ...
## $ has_internet : chr [1:649] "No" "Yes" "Yes" "Yes" ...
## $ is_dating : chr [1:649] "No" "No" "No" "Yes" ...
## $ good_family_relationship : chr [1:649] "Good" "Good" "Good" "Fair" ...
## $ free_time_after_school : chr [1:649] "Moderate" "Moderate" "Moderate" "Low" ...
## $ time_with_friends : chr [1:649] "High" "Moderate" "Low" "Low" ...
## $ alcohol_weekdays : chr [1:649] "Low" "Low" "Low" "Low" ...
## $ alcohol_weekends : chr [1:649] "Low" "Low" "Moderate" "Low" ...
## $ health_status : chr [1:649] "Fair" "Fair" "Fair" "Good" ...
## $ school_absence : num [1:649] 4 2 6 0 0 6 0 2 0 0 ...
## $ grade_1st_semester : num [1:649] 0 9 12 14 11 12 13 10 15 12 ...
## $ grade_2nd_semester : num [1:649] 11 11 13 14 13 12 12 13 16 12 ...
# resumen general de la base de datos
summary(base_datos_clean)
## school gender age housing_type
## Length:649 Length:649 Min. :15.00 Length:649
## Class :character Class :character 1st Qu.:16.00 Class :character
## Mode :character Mode :character Median :17.00 Mode :character
## Mean :16.74
## 3rd Qu.:18.00
## Max. :22.00
## family_size parental_status mother_education father_education
## Length:649 Length:649 Length:649 Length:649
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## mother_work father_work reason_school_choice
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## legal_responsibility commute_time weekly_study_time
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extra_educational_support parental_educational_support private_tutoring
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extracurricular_activities attended_daycare desire_graduate_education
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## has_internet is_dating good_family_relationship
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## free_time_after_school time_with_friends alcohol_weekdays
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## alcohol_weekends health_status school_absence grade_1st_semester
## Length:649 Length:649 Min. : 0.000 Min. : 0.0
## Class :character Class :character 1st Qu.: 0.000 1st Qu.:10.0
## Mode :character Mode :character Median : 2.000 Median :11.0
## Mean : 3.659 Mean :11.4
## 3rd Qu.: 6.000 3rd Qu.:13.0
## Max. :32.000 Max. :19.0
## grade_2nd_semester
## Min. : 0.00
## 1st Qu.:10.00
## Median :11.00
## Mean :11.57
## 3rd Qu.:13.00
## Max. :19.00
# explora data
skimr::skim(base_datos_clean)
Data summary
| Name |
base_datos_clean |
| Number of rows |
649 |
| Number of columns |
31 |
| _______________________ |
|
| Column type frequency: |
|
| character |
27 |
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| school |
0 |
1 |
15 |
20 |
0 |
2 |
0 |
| gender |
0 |
1 |
4 |
6 |
0 |
2 |
0 |
| housing_type |
0 |
1 |
5 |
5 |
0 |
2 |
0 |
| family_size |
0 |
1 |
7 |
7 |
0 |
2 |
0 |
| parental_status |
0 |
1 |
9 |
15 |
0 |
2 |
0 |
| mother_education |
0 |
1 |
4 |
22 |
0 |
5 |
0 |
| father_education |
0 |
1 |
4 |
22 |
0 |
5 |
0 |
| mother_work |
0 |
1 |
5 |
9 |
0 |
5 |
0 |
| father_work |
0 |
1 |
5 |
9 |
0 |
5 |
0 |
| reason_school_choice |
0 |
1 |
5 |
17 |
0 |
4 |
0 |
| legal_responsibility |
0 |
1 |
5 |
6 |
0 |
3 |
0 |
| commute_time |
0 |
1 |
12 |
12 |
0 |
4 |
0 |
| weekly_study_time |
0 |
1 |
7 |
13 |
0 |
4 |
0 |
| extra_educational_support |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| parental_educational_support |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| private_tutoring |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| extracurricular_activities |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| attended_daycare |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| desire_graduate_education |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| has_internet |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| is_dating |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| good_family_relationship |
0 |
1 |
4 |
4 |
0 |
3 |
0 |
| free_time_after_school |
0 |
1 |
3 |
8 |
0 |
3 |
0 |
| time_with_friends |
0 |
1 |
3 |
8 |
0 |
3 |
0 |
| alcohol_weekdays |
0 |
1 |
3 |
8 |
0 |
3 |
0 |
| alcohol_weekends |
0 |
1 |
3 |
8 |
0 |
3 |
0 |
| health_status |
0 |
1 |
4 |
4 |
0 |
3 |
0 |
Variable type: numeric
| age |
0 |
1 |
16.74 |
1.22 |
15 |
16 |
17 |
18 |
22 |
▇▅▅▁▁ |
| school_absence |
0 |
1 |
3.66 |
4.64 |
0 |
0 |
2 |
6 |
32 |
▇▂▁▁▁ |
| grade_1st_semester |
0 |
1 |
11.40 |
2.75 |
0 |
10 |
11 |
13 |
19 |
▁▂▇▇▁ |
| grade_2nd_semester |
0 |
1 |
11.57 |
2.91 |
0 |
10 |
11 |
13 |
19 |
▁▁▇▇▂ |
Variables cuantitativas
# crear un dataframe con sólo las columnas con valores numéricos
base_datos_num <- base_datos_clean %>% select_if(is.numeric)
base_datos_num
## # A tibble: 649 × 4
## age school_absence grade_1st_semester grade_2nd_semester
## <dbl> <dbl> <dbl> <dbl>
## 1 18 4 0 11
## 2 17 2 9 11
## 3 15 6 12 13
## 4 15 0 14 14
## 5 16 0 11 13
## 6 16 6 12 12
## 7 16 0 13 12
## 8 17 2 10 13
## 9 15 0 15 16
## 10 15 0 12 12
## # ℹ 639 more rows
Estadísticas más Específicas
# brinda estadísticas más específicas
estadisticas <- stat.desc(base_datos_num)
estadisticas
## age school_absence grade_1st_semester grade_2nd_semester
## nbr.val 6.490000e+02 649.0000000 649.0000000 649.0000000
## nbr.null 0.000000e+00 244.0000000 1.0000000 7.0000000
## nbr.na 0.000000e+00 0.0000000 0.0000000 0.0000000
## min 1.500000e+01 0.0000000 0.0000000 0.0000000
## max 2.200000e+01 32.0000000 19.0000000 19.0000000
## range 7.000000e+00 32.0000000 19.0000000 19.0000000
## sum 1.086700e+04 2375.0000000 7398.0000000 7509.0000000
## median 1.700000e+01 2.0000000 11.0000000 11.0000000
## mean 1.674422e+01 3.6594761 11.3990755 11.5701079
## SE.mean 4.781608e-02 0.1821657 0.1077611 0.1143703
## CI.mean.0.95 9.389318e-02 0.3577064 0.2116031 0.2245812
## var 1.483859e+00 21.5366423 7.5364806 8.4892903
## std.dev 1.218138e+00 4.6407588 2.7452651 2.9136387
## coef.var 7.274973e-02 1.2681484 0.2408323 0.2518247
Histogramas
# crea un histograma para cada columna cuantitativa
lapply(names(base_datos_num), function(col_name) {
col <- base_datos_num[[col_name]]
ggplot(data.frame(col), aes(x = col)) +
geom_histogram(binwidth = 1, fill = "blue") +
labs(title = col_name, x = col_name, y = "Frequencia")
})
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

Densidad
# crea un gráfico de densidad para cada columna cuantitativa
lapply(names(base_datos_num), function(col_name) {
col <- base_datos_num[[col_name]]
ggplot(data.frame(col), aes(x = col)) +
geom_density() +
labs(x = col_name)
})
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

Gráficos de Barra
# crear gráficos de barra para cada columna cuantitativa
lapply(names(base_datos_num), function(col_name) {
col <- base_datos_num[[col_name]]
ggplot(data.frame(col), aes(x = col)) +
geom_bar(stat = "count", fill = "darkred") +
labs(title = col_name, x = col_name, y = "")
})
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

Variables cualitativas
# crear un dataframe con sólo las columnas de string
base_datos_str <- base_datos_clean %>% select_if(is.character)
base_datos_str
## # A tibble: 649 × 27
## school gender housing_type family_size parental_status mother_education
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Gabriel Per… Female Urban Above 3 Separated Higher Education
## 2 Gabriel Per… Female Urban Above 3 Living Together Primary School
## 3 Gabriel Per… Female Urban Up to 3 Living Together Primary School
## 4 Gabriel Per… Female Urban Above 3 Living Together Higher Education
## 5 Gabriel Per… Female Urban Above 3 Living Together High School
## 6 Gabriel Per… Male Urban Up to 3 Living Together Higher Education
## 7 Gabriel Per… Male Urban Up to 3 Living Together Lower Secondary…
## 8 Gabriel Per… Female Urban Above 3 Separated Higher Education
## 9 Gabriel Per… Male Urban Up to 3 Separated High School
## 10 Gabriel Per… Male Urban Above 3 Living Together High School
## # ℹ 639 more rows
## # ℹ 21 more variables: father_education <chr>, mother_work <chr>,
## # father_work <chr>, reason_school_choice <chr>, legal_responsibility <chr>,
## # commute_time <chr>, weekly_study_time <chr>,
## # extra_educational_support <chr>, parental_educational_support <chr>,
## # private_tutoring <chr>, extracurricular_activities <chr>,
## # attended_daycare <chr>, desire_graduate_education <chr>, …
Gráficos de barra
# crear gráficos de barra para cada columna cualitativa
lapply(names(base_datos_str), function(col_name) {
col <- base_datos_str[[col_name]]
ggplot(data.frame(col), aes(x = col)) +
geom_bar(stat = "count", fill = "darkred") +
labs(title = col_name, x = col_name, y = "")
})
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

##
## [[11]]

##
## [[12]]

##
## [[13]]

##
## [[14]]

##
## [[15]]

##
## [[16]]

##
## [[17]]

##
## [[18]]

##
## [[19]]

##
## [[20]]

##
## [[21]]

##
## [[22]]

##
## [[23]]

##
## [[24]]

##
## [[25]]

##
## [[26]]

##
## [[27]]

Covariaciones
Variables cualitativas y cuantitativas
# Relaciona la nota del primer semestre con la cantidad de alcohol consumida entre semana
ggplot(base_datos_clean, aes(x = grade_1st_semester, y = alcohol_weekdays, group = alcohol_weekdays)) +
geom_density_ridges()
## Picking joint bandwidth of 0.747

# Relaciona la nota del primer semestre con la cantidad de alcohol consumida en fin de semana
ggplot(base_datos_clean, aes(x = grade_1st_semester, y = alcohol_weekends, group = alcohol_weekends)) +
geom_density_ridges()
## Picking joint bandwidth of 0.822

# Relaciona la nota del segundo semestre con la cantidad de alcohol consumida entre semana
ggplot(base_datos_clean, aes(x = grade_2nd_semester, y = alcohol_weekdays, group = alcohol_weekdays)) +
geom_density_ridges()
## Picking joint bandwidth of 0.678

# Relaciona la nota del segundo semestre con la cantidad de alcohol consumida en fin de semana
ggplot(base_datos_clean, aes(x = grade_2nd_semester, y = alcohol_weekends, group = alcohol_weekends)) +
geom_density_ridges()
## Picking joint bandwidth of 0.856

# crear un mapa de calor
create_heatmap <- function(col_name) {
count_data <- base_datos_str %>% count(alcohol_weekdays, !!sym(col_name))
ggplot(count_data, aes(x = alcohol_weekdays, y = !!sym(col_name))) +
geom_tile(aes(fill = n), color = "white") +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = paste("Comparación de alcohol entre semana con", col_name),
x = "Alcohol entre semana", y = col_name)
}
# aplicar la función a tods las columnas
heatmap_plots <- lapply(names(base_datos_str)[-which(names(base_datos_str) == "alcohol_weekdays")], create_heatmap)
print(heatmap_plots)
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

##
## [[11]]

##
## [[12]]

##
## [[13]]

##
## [[14]]

##
## [[15]]

##
## [[16]]

##
## [[17]]

##
## [[18]]

##
## [[19]]

##
## [[20]]

##
## [[21]]

##
## [[22]]

##
## [[23]]

##
## [[24]]

##
## [[25]]

##
## [[26]]

# crear un mapa de calor
create_heatmap <- function(col_name) {
count_data <- base_datos_str %>% count(alcohol_weekends, !!sym(col_name))
ggplot(count_data, aes(x = alcohol_weekends, y = !!sym(col_name))) +
geom_tile(aes(fill = n), color = "white") +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = paste("Comparación de alcohol en fin de semana con", col_name),
x = "Alcohol en fin de semana", y = col_name)
}
# aplicar la unción a todas las columnas
heatmap_plots <- lapply(names(base_datos_str)[-which(names(base_datos_str) == "alcohol_weekends")], create_heatmap)
print(heatmap_plots)
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

##
## [[11]]

##
## [[12]]

##
## [[13]]

##
## [[14]]

##
## [[15]]

##
## [[16]]

##
## [[17]]

##
## [[18]]

##
## [[19]]

##
## [[20]]

##
## [[21]]

##
## [[22]]

##
## [[23]]

##
## [[24]]

##
## [[25]]

##
## [[26]]

Prototipo de Modelación
Tablas de contingencia Alcohol entre semana
# Crear tablas de contingencia para cada columna cualitativa y la de cantidad de alcohol entre semana
tablas_contingencias_1 <- lapply(base_datos_str, function(col) {
table(col, base_datos_str$alcohol_weekdays)
})
print(tablas_contingencias_1)
## $school
##
## col High Low Moderate
## Gabriel Pereira 22 379 22
## Mousinho da Silveira 12 193 21
##
## $gender
##
## col High Low Moderate
## Female 9 363 11
## Male 25 209 32
##
## $housing_type
##
## col High Low Moderate
## Rural 10 168 19
## Urban 24 404 24
##
## $family_size
##
## col High Low Moderate
## Above 3 23 408 26
## Up to 3 11 164 17
##
## $parental_status
##
## col High Low Moderate
## Living Together 31 501 37
## Separated 3 71 6
##
## $mother_education
##
## col High Low Moderate
## High School 8 118 13
## Higher Education 9 155 11
## Lower Secondary School 7 173 6
## None 0 5 1
## Primary School 10 121 12
##
## $father_education
##
## col High Low Moderate
## High School 5 117 9
## Higher Education 7 110 11
## Lower Secondary School 11 188 10
## None 0 7 0
## Primary School 11 150 13
##
## $mother_work
##
## col High Low Moderate
## Health 0 45 3
## Homemaker 8 119 8
## other 14 229 15
## Services 9 118 9
## Teacher 3 61 8
##
## $father_work
##
## col High Low Moderate
## Health 1 20 2
## Homemaker 0 39 3
## other 17 329 21
## Services 14 150 17
## Teacher 2 34 0
##
## $reason_school_choice
##
## col High Low Moderate
## Course Preference 13 258 14
## Near Home 10 127 12
## Other 7 56 9
## Reputation 4 131 8
##
## $legal_responsibility
##
## col High Low Moderate
## Father 8 133 12
## Mother 20 408 27
## Other 6 31 4
##
## $commute_time
##
## col High Low Moderate
## 15 to 30 min 11 189 13
## 30 min to 1h 4 42 8
## More than 1h 3 12 1
## Up to 15 min 16 329 21
##
## $weekly_study_time
##
## col High Low Moderate
## 2 to 5h 14 278 13
## 5 to 10h 2 94 1
## More than 10h 2 29 4
## Up to 2h 16 171 25
##
## $extra_educational_support
##
## col High Low Moderate
## No 30 510 41
## Yes 4 62 2
##
## $parental_educational_support
##
## col High Low Moderate
## No 12 215 24
## Yes 22 357 19
##
## $private_tutoring
##
## col High Low Moderate
## No 31 539 40
## Yes 3 33 3
##
## $extracurricular_activities
##
## col High Low Moderate
## No 14 296 24
## Yes 20 276 19
##
## $attended_daycare
##
## col High Low Moderate
## No 10 109 9
## Yes 24 463 34
##
## $desire_graduate_education
##
## col High Low Moderate
## No 8 55 6
## Yes 26 517 37
##
## $has_internet
##
## col High Low Moderate
## No 5 135 11
## Yes 29 437 32
##
## $is_dating
##
## col High Low Moderate
## No 14 364 32
## Yes 20 208 11
##
## $good_family_relationship
##
## col High Low Moderate
## Fair 6 92 3
## Good 24 440 33
## Poor 4 40 7
##
## $free_time_after_school
##
## col High Low Moderate
## High 17 203 26
## Low 7 135 10
## Moderate 10 234 7
##
## $time_with_friends
##
## col High Low Moderate
## High 23 202 26
## Low 4 182 7
## Moderate 7 188 10
##
## $alcohol_weekdays
##
## col High Low Moderate
## High 34 0 0
## Low 0 572 0
## Moderate 0 0 43
##
## $alcohol_weekends
##
## col High Low Moderate
## High 26 74 32
## Low 4 391 2
## Moderate 4 107 9
##
## $health_status
##
## col High Low Moderate
## Fair 9 110 5
## Good 19 310 28
## Poor 6 152 10
Prueba Chi-Cuadrado
# aplicar la prueba de independencia de chi-cuadrado a cada tabla de contingencia
chi_cuadrado_1 <- lapply(tablas_contingencias_1, chisq.test)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
chi_cuadrado_1
## $school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.0191, df = 2, p-value = 0.134
##
##
## $gender
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 39.436, df = 2, p-value = 2.733e-09
##
##
## $housing_type
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.1675, df = 2, p-value = 0.1245
##
##
## $family_size
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.3978, df = 2, p-value = 0.3015
##
##
## $parental_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.49529, df = 2, p-value = 0.7806
##
##
## $mother_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.3106, df = 8, p-value = 0.3168
##
##
## $father_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.1102, df = 8, p-value = 0.847
##
##
## $mother_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 6.1653, df = 8, p-value = 0.6287
##
##
## $father_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 10.683, df = 8, p-value = 0.2203
##
##
## $reason_school_choice
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 12.356, df = 6, p-value = 0.05448
##
##
## $legal_responsibility
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.6798, df = 4, p-value = 0.04618
##
##
## $commute_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 13.687, df = 6, p-value = 0.03333
##
##
## $weekly_study_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 23.815, df = 6, p-value = 0.0005648
##
##
## $extra_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.696, df = 2, p-value = 0.4283
##
##
## $parental_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 5.7747, df = 2, p-value = 0.05572
##
##
## $private_tutoring
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.60637, df = 2, p-value = 0.7385
##
##
## $extracurricular_activities
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.7848, df = 2, p-value = 0.4097
##
##
## $attended_daycare
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.2162, df = 2, p-value = 0.3302
##
##
## $desire_graduate_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.0739, df = 2, p-value = 0.0291
##
##
## $has_internet
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.5606, df = 2, p-value = 0.4583
##
##
## $is_dating
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.4615, df = 2, p-value = 0.00882
##
##
## $good_family_relationship
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.4854, df = 4, p-value = 0.1124
##
##
## $free_time_after_school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 15.161, df = 4, p-value = 0.004379
##
##
## $time_with_friends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 24.017, df = 4, p-value = 7.925e-05
##
##
## $alcohol_weekdays
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1298, df = 4, p-value < 2.2e-16
##
##
## $alcohol_weekends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 171.75, df = 4, p-value < 2.2e-16
##
##
## $health_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.2113, df = 4, p-value = 0.3782
Prueba Chi-Cuadrado
# aplicar la prueba de independencia de chi-cuadrado a cada tabla de contingencia
chi_cuadrado_2 <- lapply(tablas_contingencias_2, chisq.test)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
chi_cuadrado_2
## $school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.085819, df = 2, p-value = 0.958
##
##
## $gender
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 69.654, df = 2, p-value = 7.495e-16
##
##
## $housing_type
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.14167, df = 2, p-value = 0.9316
##
##
## $family_size
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.171, df = 2, p-value = 0.2049
##
##
## $parental_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.8628, df = 2, p-value = 0.1449
##
##
## $mother_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 10.255, df = 8, p-value = 0.2476
##
##
## $father_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.9856, df = 8, p-value = 0.266
##
##
## $mother_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.0431, df = 8, p-value = 0.532
##
##
## $father_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 18.679, df = 8, p-value = 0.01668
##
##
## $reason_school_choice
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 6.8145, df = 6, p-value = 0.3383
##
##
## $legal_responsibility
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.0722, df = 4, p-value = 0.8987
##
##
## $commute_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 8.0027, df = 6, p-value = 0.2379
##
##
## $weekly_study_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 37.497, df = 6, p-value = 1.409e-06
##
##
## $extra_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.4215, df = 2, p-value = 0.298
##
##
## $parental_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 6.8137, df = 2, p-value = 0.03314
##
##
## $private_tutoring
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.0945, df = 2, p-value = 0.2128
##
##
## $extracurricular_activities
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.8354, df = 2, p-value = 0.3994
##
##
## $attended_daycare
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.1753, df = 2, p-value = 0.2044
##
##
## $desire_graduate_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.3507, df = 2, p-value = 0.1136
##
##
## $has_internet
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.7657, df = 2, p-value = 0.2509
##
##
## $is_dating
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.53281, df = 2, p-value = 0.7661
##
##
## $good_family_relationship
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 5.057, df = 4, p-value = 0.2815
##
##
## $free_time_after_school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 18.865, df = 4, p-value = 0.0008356
##
##
## $time_with_friends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 94.004, df = 4, p-value < 2.2e-16
##
##
## $alcohol_weekdays
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 171.75, df = 4, p-value < 2.2e-16
##
##
## $alcohol_weekends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1298, df = 4, p-value < 2.2e-16
##
##
## $health_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.4814, df = 4, p-value = 0.1125
# asegurarse que los datos sean factores
base_datos_str[] <- lapply(base_datos_str, as.factor)
# generar la V de Cramer para cada columna con alcohol_weekdays
v_cramer_entre_semana <- sapply(base_datos_str, function(col) {
cramerV(base_datos_str$alcohol_weekdays, col, ci = FALSE, conf = 0.95, type = "perc", R = 1000, histogram = FALSE, digits = 4, bias.correct = FALSE, reportIncomplete = FALSE, verbose = FALSE, tolerance = 1e-16)
})
print(v_cramer_entre_semana)
## school.Cramer V gender.Cramer V
## 0.07869 0.24650
## housing_type.Cramer V family_size.Cramer V
## 0.08013 0.06078
## parental_status.Cramer V mother_education.Cramer V
## 0.02763 0.08469
## father_education.Cramer V mother_work.Cramer V
## 0.05627 0.06892
## father_work.Cramer V reason_school_choice.Cramer V
## 0.09072 0.09757
## legal_responsibility.Cramer V commute_time.Cramer V
## 0.08636 0.10270
## weekly_study_time.Cramer V extra_educational_support.Cramer V
## 0.13550 0.05112
## parental_educational_support.Cramer V private_tutoring.Cramer V
## 0.09433 0.03057
## extracurricular_activities.Cramer V attended_daycare.Cramer V
## 0.05244 0.05844
## desire_graduate_education.Cramer V has_internet.Cramer V
## 0.10440 0.04904
## is_dating.Cramer V good_family_relationship.Cramer V
## 0.12070 0.07594
## free_time_after_school.Cramer V time_with_friends.Cramer V
## 0.10810 0.13600
## alcohol_weekdays.Cramer V alcohol_weekends.Cramer V
## 1.00000 0.36380
## health_status.Cramer V
## 0.05696
# generar la V de Cramer para cada columna con alcohol_weekends
v_cramer_fin_semana <- sapply(base_datos_str, function(col) {
cramerV(base_datos_str$alcohol_weekends, col, ci = FALSE, conf = 0.95, type = "perc", R = 1000, histogram = FALSE, digits = 4, bias.correct = FALSE, reportIncomplete = FALSE, verbose = FALSE, tolerance = 1e-16)
})
print(v_cramer_fin_semana)
## school.Cramer V gender.Cramer V
## 0.01150 0.32760
## housing_type.Cramer V family_size.Cramer V
## 0.01477 0.06990
## parental_status.Cramer V mother_education.Cramer V
## 0.07715 0.08889
## father_education.Cramer V mother_work.Cramer V
## 0.08771 0.07366
## father_work.Cramer V reason_school_choice.Cramer V
## 0.12000 0.07246
## legal_responsibility.Cramer V commute_time.Cramer V
## 0.02874 0.07852
## weekly_study_time.Cramer V extra_educational_support.Cramer V
## 0.17000 0.06108
## parental_educational_support.Cramer V private_tutoring.Cramer V
## 0.10250 0.06905
## extracurricular_activities.Cramer V attended_daycare.Cramer V
## 0.05318 0.06995
## desire_graduate_education.Cramer V has_internet.Cramer V
## 0.08188 0.06528
## is_dating.Cramer V good_family_relationship.Cramer V
## 0.02865 0.06242
## free_time_after_school.Cramer V time_with_friends.Cramer V
## 0.12060 0.26910
## alcohol_weekdays.Cramer V alcohol_weekends.Cramer V
## 0.36380 1.00000
## health_status.Cramer V
## 0.07592